In [51]:
from bokeh.io import show, output_notebook, output_file
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, FactorRange, Legend, HoverTool, DatetimeTickFormatter, BoxAnnotation, Toggle, CheckboxButtonGroup,CheckboxGroup
from bokeh.models import Label
from bokeh.transform import dodge
from bokeh.layouts import layout, gridplot, row, column, widgetbox
from bokeh.models.widgets import Tabs, Panel
from matplotlib.pyplot import viridis
import bokeh.palettes
from bokeh.embed import file_html
from bokeh.resources import CDN
from IPython.display import HTML
output_notebook()
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import json
from branca.colormap import linear
from folium.features import DivIcon
import folium
import math
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

HTML('''<script>
$('div.input').hide('500')
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Display code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Display code"></form>''')


    
Loading BokehJS ...
Out[51]:
In [2]:
#from IPython.core.display import display, HTML
#display(HTML("<style>.rendered_html { font-size: 25px; }</style>"))
In [3]:
%%HTML
<style>
div.prompt {display:none}
</style>

Transport vs covid-19

02806 Social data analysis and visualization

Technical University of Denmark

Introduction

This slideshow will investigate how the covid-19 lockdown in Denmark has effected public transport in the Urban Area of Copenhagen? This will be done by answering the following questions:

  • Is the development of public transport different from municipality to municipality?
  • If so, can the demographic of the municipalities explain the difference?
    • How is age, education level, unemployment rate and disposable income distributed between the municipalities?
    • How do they correlate with the development of public transport.

Development of average speed per day

In [4]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'

allspeed = pd.read_csv(os.path.join(path,"speedPerDay.csv"),sep=";",index_col=0)

allspeed.index = pd.to_datetime(allspeed.index)

source = ColumnDataSource(allspeed)

p = figure(plot_width=775, plot_height=300, x_axis_type="datetime",toolbar_location=None,
          y_range=(25,32)
          )
p.title.text = 'Avg. speed of public busses per day in Copenhagen'

color = bokeh.palettes.Category20[3]

legend_it = []

r = p.line("Date",
           "Hast",
           line_width=2,
           color=color[1],
           alpha=0.8,
           muted_color=color[1],
           muted_alpha=0.2,
           visible=True,
           source=source)

p.yaxis.axis_label = 'km/t'


hover = HoverTool(tooltips=[
    ('Speed: ', '@Hast km/t'),
    ("Date: ", '@Date{%d-%m}'),
    ("Day: ",'@DayOfWeek')],
    formatters = {'@Date':'datetime'},
    renderers=[r],
    mode="vline")

p.add_tools(hover)

lockdown1_start = pd.to_datetime('20200311')
lockdown1_end = pd.to_datetime('20200318')

lockdown2_start = pd.to_datetime('20200318')
lockdown2_end = pd.to_datetime('20200415')

easter_start = pd.to_datetime('20200404')
easter_end = pd.to_datetime('20200412')

reopening_start = pd.to_datetime('20200415')
reopening_end = pd.to_datetime('20200421')


lockdown1 = BoxAnnotation(left=lockdown1_start, right=lockdown1_end, 
                          fill_color='red', fill_alpha=0.1,visible=True)
lockdown2 = BoxAnnotation(left=lockdown2_start, right=lockdown2_end, 
                           fill_color='red', fill_alpha=0.2,visible=True)

easter = BoxAnnotation(left=easter_start, right=easter_end, 
                       fill_color='yellow', fill_alpha=0.2,visible=True)

reopening = BoxAnnotation(left=reopening_start, right=reopening_end, 
                       fill_color='green', fill_alpha=0.1,visible=True)

p.add_layout(lockdown1)
p.add_layout(lockdown2)
p.add_layout(easter)
p.add_layout(reopening)

#toggle1 = Toggle(label="Lockdown vol. 1", button_type="success", active=True)
#toggle1.js_link('active', lockdown1, 'visible')

#toggle2 = Toggle(label="Lockdown vol. 2", button_type="success", active=True)
#toggle2.js_link('active', lockdown2, 'visible')

#toggle3 = Toggle(label="Easter Break",button_type="success",active=True)
#toggle3.js_link('active', easter, 'visible')

#toggle4 = Toggle(label="Denmarks start reopening",button_type="success",active=True)
#toggle4.js_link('active', reopening, 'visible')

#show(layout([p], [toggle1,toggle2,toggle4]))

lockdown1_cit = Label(x=235, y=230, x_units='screen', y_units='screen',
                 text='First step of lockdown', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='8pt')

lockdown2_cit = Label(x=280, y=210, x_units='screen', y_units='screen',
                 text='Second step of lockdown', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='8pt')

easter_cit = Label(x=485, y=230, x_units='screen', y_units='screen',
                 text='Easter Break', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='8pt')

opening_cit = Label(x=600, y=230, x_units='screen', y_units='screen',
                 text='Opening', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='8pt')


p.add_layout(lockdown1_cit)
p.add_layout(lockdown2_cit)
p.add_layout(easter_cit)
p.add_layout(opening_cit)

show(p)

The amount of public transport can be measured on the average speed (km/t) of public busses per day. It can be seen from when the lockdown starter the 13. March that the average speed generally got higher. Especially from the 28/03 to the 13/04 is the average speed consistently high until a bit before the start of the reopening the 15/04.

Try to hover over the graphic. Can you figure out why the average speed varies as a wave?

Different increase of speed in municipalities

The below choropleth map shows the percentage increase of speed during the lockdown for each municipality in the Urban Area of Copenhagen. First of all, it can be included that the speed have increased in all municipalities but that the increase varies from 0.14% in Høje-Taastrup to 8.8% in Rødovre. One intuitive explanation could be that the municipalities with motorways going through would have bigger increases because there not would be any lines on the motorway. However, that does not seem to be the case because there for instance runs motorways through Rødovre and Høje-Taastrup which have the highest and lowest increase.

You can try to verify by yourself by removing and applying the choropleth map in the bottom right corner and seeing where the motorway (the red lines) go through the city.

In [5]:
import json

sel_mun = ["Albertslund","Ballerup","Brøndby","Dragør","Frederiksberg","Gentofte","Gladsaxe",
                           "Glostrup","Herlev","Hvidovre","Høje-Taastrup","Ishøj","København",
                           "Rødovre","Tårnby","Vallensbæk"]

path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'

with open(os.path.join(path,"big_cph_lon_lat.json")) as f:
    big_cph = json.load(f)
    

speed = pd.read_csv(os.path.join(path,"speedPerMun.csv"),sep=";")

speed = speed.set_index("Date")

before_lockdown = speed[speed.index < "2020-03-11"]
before_lockdown = pd.DataFrame(before_lockdown.mean(),columns=["avg_speed_feb"])
before_lockdown = before_lockdown.reset_index()
before_lockdown = before_lockdown.rename(columns={"index":"Kommune"})

after_lockdown = speed[speed.index >= "2020-03-11"]
after_lockdown = pd.DataFrame(after_lockdown.mean(),columns=["avg_speed_mar"])
after_lockdown = after_lockdown.reset_index()
after_lockdown = after_lockdown.rename(columns={"index":"Kommune"})

avg_speed = before_lockdown.merge(after_lockdown,on="Kommune")
avg_speed = avg_speed.set_index("Kommune")

avg_speed["per_increase"] = ((avg_speed.avg_speed_mar-avg_speed.avg_speed_feb)/avg_speed.avg_speed_feb)*100

mun_map = folium.Map(location=[55.676098,12.568337],zoom_start=11,tiles='OpenStreetMap')

colormap = linear.YlOrRd_04.scale(avg_speed["per_increase"].min(),avg_speed["per_increase"].max())

df_dict = avg_speed["per_increase"]

folium.GeoJson(
    big_cph,
    name='Percentage increase in speed after lockdown',
    style_function=lambda feature: {
        'fillColor': colormap(df_dict[feature['properties']['KOMNAVN']]),
        'color': 'black',
        'weight': 1,
        #'dashArray': '5, 5',
        'fillOpacity': 1,
    }
).add_to(mun_map)

folium.LayerControl(position="bottomright",collapsed=False).add_to(mun_map)

colormap.caption = 'Percentage increase in speed'
colormap.add_to(mun_map)

coordinates = [[55.693587,12.344985],[55.734238,12.364346],
               [55.653268,12.418906],[55.592974,12.645970],
               [55.682220,12.514292],[55.753021,12.552256],
               [55.747834,12.472433],[55.671153,12.398190],
               [55.735531,12.434457],[55.634567,12.465933],
               [55.665301,12.266118],[55.623098,12.330763],
               [55.711402,12.533741],[55.682886,12.448000],
               [55.610549,12.600898],[55.636898,12.368548]]

coordinates_dict = dict(zip(sel_mun,coordinates))

for m in sel_mun:
    folium.map.Marker(
        coordinates_dict[m],
        icon=DivIcon(
            icon_size=(150,36),
            icon_anchor=(25,0),
            html=f'<div style="font-size: 10pt">{m}</div>',
            )
        ).add_to(mun_map)

mun_map
Out[5]:
In [6]:
#Plan B:
#<img src="https://tuelindhart.github.io/Transport_vs_covid19/Images/Choropleth_speed.png" style="width: 800px;"/>

Hover over the bar graph to see the exact percentage increase in speed

In [7]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'

focus_municipality = ["Albertslund","Ballerup","Brøndby","Dragør","Frederiksberg","Gentofte","Gladsaxe",
                           "Glostrup","Herlev","Hvidovre","Høje-Taastrup","Ishøj","København",
                           "Rødovre","Tårnby","Vallensbæk"]

df_hast_daily = pd.read_csv(os.path.join(path,"speedPerMun.csv"),sep=";")
df_hast_daily = df_hast_daily.set_index("Date")

df_avg_hast_pre_lockdown = df_hast_daily[df_hast_daily.index < "2020-03-11"].mean(0)
df_avg_hast_post_lockdown = df_hast_daily[df_hast_daily.index >= "2020-03-11"].mean(0)

df_avg_hast = pd.DataFrame((df_avg_hast_post_lockdown - df_avg_hast_pre_lockdown)/df_avg_hast_pre_lockdown,
                             columns=["pct_increase"])*100

df_avg_hast = df_avg_hast.reset_index()
df_avg_hast.columns = ["Municipality","pct_increase"]
df_avg_hast = df_avg_hast.set_index("Municipality")

#Creating format fitting for bokeh
source = ColumnDataSource(df_avg_hast)

#Creating a list for the x-axis on the bar-chart. 
x_range = [str(h) for h in source.data['Municipality']]

#Generating 14 colors for the different focus crimes  
color = bokeh.palettes.Category20[20][2]

bar ={} # to store vbars

#Creating figure frame with title, x -and y labels and disabling toolbar. 
p = figure(x_range=FactorRange(factors=x_range),
           title='Increase in speed per municipality',
           x_axis_label='Municipality',
           y_axis_label='Increase in speed (%)',
           plot_width=700,plot_height=400)


r = p.vbar("Municipality",  top="pct_increase", source= source, width=0.8, 
             fill_alpha=0.5,line_color=None,
             color=color,visible=True)

p.xaxis.major_label_orientation = math.pi/4

mun = r.name #extracting the name of each rendered crime
hover = HoverTool(tooltips=[
    ('Increase in speed (%)' , '@pct_increase{0.2f} %') #@ - extracts the fractions from each focuscrime
], renderers=[r]) # assigns the hoved effect to the correct bar rendering

p.add_tools(hover)
    
p.y_range.start = 0 #setting the y range from zero

show(p)

If you are interested in how the speed have evolved per day for each municipality before and during the lock you can click here

Development of passengers in public busses

The amount of public transport can also be measured by the average number of passengers per municipality boarding busses. Remember, that these numbers only reflect the drop in transport with public busses. For example, this means that the supposed large drop in passengers going to airport not will show in the numbers.

On the barchart below is the avg. number of boarding passengers per municipality in the periods February 2019, March 2019, February 2020 and March 2020. If you click on the names February 2019 and March 2019 then you can see that there are most boarding passengers in Frederiksberg and Copenhagen while there is few in Vallensbæk and Dragør. You can also see that the number of avg. passengers in February and March are almost equal in 2019.

Now try to click on February 2020 and March 2020. Now the number of passengers dropped in all municipalities and some places it seems to be cut in half! This indicates that the drop in passengers is because of the lockdown and not an seasonal effect.

But how about yearly effects? The opening of the new metro in Copenhagen could quite possibly have an effect on the number of passengers using busses. If you click on February 2019 and February 2020 then you can see that the number of passengers stay mainly the same except in Frederiksberg, Copenhagen (København), Gladsaxe and Høje-Taastrup where it is very likely the drop in passengers are due to the new metro.

In [8]:
sel_mun = ["Albertslund","Ballerup","Brøndby","Dragør","Frederiksberg","Gentofte","Gladsaxe",
                           "Glostrup","Herlev","Hvidovre","Høje-Taastrup","Ishøj","København",
                           "Rødovre","Tårnby","Vallensbæk"]

path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/' + 'Passagertal_SenesteVersion.csv'

df = pd.read_csv(path,sep=";",
                 thousands=",")

df_201902 = df[["Kommune","På_201902","Af_201902"]].dropna()
df_201903 = df[["Kommune","På_201903","Af_201903"]].dropna()
df_202002 = df[["Kommune","På_202002","Af_202002"]].dropna()
df_202003 = df[["Kommune","På_202003","Af_202003"]].dropna()

passenger_df = df_201902.groupby(["Kommune"]).mean().loc[sel_mun]

passenger_df = passenger_df.join(df_201903.groupby(["Kommune"]).mean().loc[sel_mun],
                                 on="Kommune")
passenger_df = passenger_df.join(df_202002.groupby(["Kommune"]).mean().loc[sel_mun],
                                 on="Kommune")
passenger_df = passenger_df.join(df_202003.groupby(["Kommune"]).mean().loc[sel_mun],
                                 on="Kommune")



#Creating format fitting for bokeh
source = ColumnDataSource(passenger_df)

#Creating a list for the x-axis on the bar-chart. 
x_range = [str(h) for h in source.data['Kommune']]

#Generating 14 colors for the different focus crimes  
colors = bokeh.palettes.Category20[8][0::2]

bar ={} # to store vbars

#Creating figure frame with title, x -and y labels and disabling toolbar. 
p = figure(x_range=FactorRange(factors=x_range),
           title='Barchart of average number of onboarding passengers',
           x_axis_label='Municipalities',
           y_axis_label='Avg. number of boarding passengers',
           plot_width=950,plot_height=400,
           toolbar_location=None)

#List of on - and offboarding passengers in february and march for 2019 and 2020
boardings = ["På_201902","På_201903","På_202002","På_202003"]


#Generating barcharts to p 
for indx,i in enumerate(boardings):
    bar[i] = p.vbar("Kommune",  top=i, source= source, width=0.8, 
                 fill_alpha=0.5,line_color=None,
                 color=colors[indx],visible=False)
    
p.xaxis.major_label_orientation = math.pi/4

legend_names = {"På_201902":"February 2019","På_201903":"March 2019",
                "På_202002":"February 2020","På_202003":"March 2020",}

items = [(legend_names[i],[bar[i]]) for i in boardings] #Creating list of tuples with focus crime name and belonging bar chart.
legend = Legend(items=items,location=(0,152)) # Creating legends with 'items'
p.add_layout(legend,'right') # Adding legends to ´p´ and setting location. 

for b in bar:
    period = legend_names[b] #extracting the name of each rendered crime
    hover = HoverTool(tooltips=[
        # The name of the crime
        ('Avg. number of passengers: ' , '@{%s}' %b) #@ - extracts the fractions from each focuscrime
    ], renderers=[bar[b]]) # assigns the hoved effect to the correct bar rendering
    p.add_tools(hover)
    
p.y_range.start = 0 #setting the y range from zero

p.legend.click_policy="hide"
p.legend.title = 'Click on names below to see barcharts'
show(p)

Comparing fall in passengers

However, it can be challenging to see on the barchart how much the number of passengers have dropped in each municipality. The barchart below can give a better intuition about this. The bar chart shows the ration between passengers in March divided by passengers in February in the corresponding year. The ratio between March 2019 and March 2020 is not compared because of the difference in passengers between February 2019 and 2020 for Frederiksberg, Copenhagen etc.

So if you click on Mar 2020 / Feb 2020 and hover over Vallensbæk you see that 68.86% passengers in March 2020 use the busses in relation to February 2020. For good measure you can compare it with the previous year Mar 2019/Feb 2019 and see that amount of passengers in 2019 roughly are the same.

In [9]:
dif_df = pd.DataFrame()

dif_df["dif_2019"] = (passenger_df.På_201903-passenger_df.På_201902).div(passenger_df.På_201902)*100
dif_df["dif_2020"] = (passenger_df.På_202003-passenger_df.På_202002).div(passenger_df.På_202002)*100
dif_df["per_2019"] = (passenger_df.På_201903).div(passenger_df.På_201902)*100
dif_df["per_2020"] = (passenger_df.På_202003).div(passenger_df.På_202002)*100
dif_df["per_feb"] = (passenger_df.På_201902-passenger_df.På_202002).div(passenger_df.På_201902)*100
dif_df["per_mar"] = (passenger_df.På_201903-passenger_df.På_202003).div(passenger_df.På_201903)*100


#Creating format fitting for bokeh
source = ColumnDataSource(dif_df)

#Creating a list for the x-axis on the bar-chart. 
x_range = [str(h) for h in source.data['Kommune']]

#Generating 14 colors for the different focus crimes  
colors = bokeh.palettes.Category20[8][0::2]

bar ={} # to store vbars

#Creating figure frame with title, x -and y labels and disabling toolbar. 
p = figure(x_range=FactorRange(factors=x_range),
           title='Passengers in march in relation to february in 2019 and 2020',
           x_axis_label='Municipalities',
           y_axis_label='% passengers in March in relation to February',
           plot_width=950,plot_height=400,
           toolbar_location=None)

#List of on - and offboarding passengers in february and march for 2019 and 2020
years = ["per_2019","per_2020"]


#Generating barcharts to p 
for indx,i in enumerate(years):
    bar[i] = p.vbar("Kommune",  top=i, source= source, width=0.8, 
                 fill_alpha=0.5,line_color=None,
                 color=colors[indx],visible=False)
    
p.xaxis.major_label_orientation = math.pi/4

legend_names = {"per_2019":"Mar 2019 / Feb 2019",
                "per_2020":"Mar 2020 / Feb 2020"}

items = [(legend_names[i],[bar[i]]) for i in years] #Creating list of tuples with focus crime name and belonging bar chart.
legend = Legend(items=items,location=(0,198)) # Creating legends with 'items'
p.add_layout(legend,'right') # Adding legends to ´p´ and setting location. 

for b in bar:
    period = legend_names[b] #extracting the name of each rendered crime
    hover = HoverTool(tooltips=[
        # The name of the crime
        ('% passengers in March in relation to February: ' , '@{%s}{0.2f}' %b) #@ - extracts the fractions from each focuscrime
    ], renderers=[bar[b]]) # assigns the hoved effect to the correct bar rendering
    p.add_tools(hover)

p.legend.click_policy="hide"
p.legend.title = 'Click on names below to see barcharts'
show(p)

Can the difference be due to the location of the municipalities. Below is a choropleth map showing the percentage decrease from February 2020 to March 2020. Surprisingly, the municipality with the biggest decrease is Albertslund followed Copenhagen (København) and Frederiksberg. København and Frederiksberg can be explained by there run many busses and it often is possible to take the bike instead of a bus. However, it is more challenging to bike into the from Albertslund.

In [10]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'

df = pd.read_csv(os.path.join(path,"passenger_decrease.csv"),index_col="Kommune")

with open(os.path.join(path,"big_cph_lon_lat.json")) as f:
    big_cph = json.load(f)
    
mun_map = folium.Map(location=[55.676098,12.568337],zoom_start=11,tiles='OpenStreetMap')


from branca.colormap import linear
from folium.features import DivIcon

colormap = linear.YlOrRd_04.scale(df["pct_drop_2020"].min(),df["pct_drop_2020"].max())

df_dict = df["pct_drop_2020"]

mun_map = folium.Map(location=[55.676098,12.568337],zoom_start=10.5,tiles='OpenStreetMap')

folium.GeoJson(
    big_cph,
    name='Percentage decrease for boarding passengers',
    style_function=lambda feature: {
        'fillColor': colormap(df_dict[feature['properties']['KOMNAVN']]),
        'color': 'black',
        'weight': 1,
        #'dashArray': '5, 5',
        'fillOpacity': 1,
    }
).add_to(mun_map)

folium.LayerControl(position="bottomright",collapsed=False).add_to(mun_map)

colormap.caption = 'Percentage decrease from march 2019 to march 2020'
colormap.add_to(mun_map)

coordinates = [[55.693587,12.344985],[55.734238,12.364346],
               [55.653268,12.418906],[55.592974,12.645970],
               [55.682220,12.514292],[55.753021,12.552256],
               [55.747834,12.472433],[55.671153,12.398190],
               [55.735531,12.434457],[55.634567,12.465933],
               [55.665301,12.266118],[55.623098,12.330763],
               [55.711402,12.533741],[55.682886,12.448000],
               [55.610549,12.600898],[55.636898,12.368548]]

coordinates_dict = dict(zip(sel_mun,coordinates))

for m in sel_mun:
    
    folium.map.Marker(
        coordinates_dict[m],
        icon=DivIcon(
            icon_size=(150,36),
            icon_anchor=(25,0),
            html=f'<div style="font-size: 10pt">{m}</div>',
            )
        ).add_to(mun_map)

    
mun_map


colormap = linear.YlOrRd_04.scale(df["pct_drop_2020"].min(),df["pct_drop_2020"].max())

df_dict = df["pct_drop_2020"]

mun_map = folium.Map(location=[55.676098,12.568337],zoom_start=10.5,tiles='OpenStreetMap')

folium.GeoJson(
    big_cph,
    name='Percentage decrease for boarding passengers',
    style_function=lambda feature: {
        'fillColor': colormap(df_dict[feature['properties']['KOMNAVN']]),
        'color': 'black',
        'weight': 1,
        #'dashArray': '5, 5',
        'fillOpacity': 1,
    }
).add_to(mun_map)

folium.LayerControl(position="bottomright",collapsed=False).add_to(mun_map)

colormap.caption = 'Percentage decrease from February 2020 to March 2020'
colormap.add_to(mun_map)

coordinates = [[55.693587,12.344985],[55.734238,12.364346],
               [55.653268,12.418906],[55.592974,12.645970],
               [55.682220,12.514292],[55.753021,12.552256],
               [55.747834,12.472433],[55.671153,12.398190],
               [55.735531,12.434457],[55.634567,12.465933],
               [55.665301,12.266118],[55.623098,12.330763],
               [55.711402,12.533741],[55.682886,12.448000],
               [55.610549,12.600898],[55.636898,12.368548]]

coordinates_dict = dict(zip(sel_mun,coordinates))

for m in sel_mun:
    
    folium.map.Marker(
        coordinates_dict[m],
        icon=DivIcon(
            icon_size=(150,36),
            icon_anchor=(25,0),
            html=f'<div style="font-size: 10pt">{m}</div>',
            )
        ).add_to(mun_map)

    
mun_map
Out[10]:

Linear regression with demographic

Explaining % change in speed with linear regression

The data shows that the increase in speed and decrease in passengers are different in the municipalities. The question is if the demographic of the municipalities can describe these differences. One way to explore this is by plotting a scatter plot with the explaining variable on the x-axis and the variable you want to explain on the y-axis. In this way it is possible to spot trend and relashionships between two variables.

Below there is 4 scatter plots with the percentage increase in speed plotted on the y-axis. The 4 demographic variables are:

  • Population Density calculated by $\frac{P_{mun}}{km^2_{mun}}$ where P denotes number of people living in the municipality and $km^2_{mun}$ denotes the square-kilometers for each municipality. Thus, density describes how densily populated a municipality is.
  • Day/Night Ratio shows the ratio between the night and day population which is calculated by $\frac{D_{mun}}{N_{mun}}$ where $N_{mun}$ is the number of people in the municipality during night and $D_{mun}$ is the number of people in the municipality during day hours. Thus, the ratio indicates how many people work and study outside their own municipality with a low ratio denoting many people work outside their municipality and vice versa. Omforklar til: Percent people who both are in the municipality day and night
  • % with high education denotes the percentage of people with a high education in each municipality where a bachelor, masters og PhD degree are counted as high educations and people attending primary school are excluded because most of these will be children.
  • & with non-vestern background is the percentage of people in each municipality which are non-vestern immigrants or are children of non-vestern immigrants.

Normally when looking looking after a correlation it is preferable to have more data points than 16 because you can risk with few data points that a trend is random. Therefore remember these explorations of the variables only can give an indication of possible trends.

Try to look at the 4 graphs and see if you think some of the variables can explain the different increase in speed in each municipality. You can hover over the graphs and see which municipality it is and what there exact values are on the x -and y axis. You can also mark a data point and it will highlight where it is positioned on the other graphs. Can you find the position of Høje-Taastrup on all the graphs. Do you think it is an outlier.

Also,have you noticed that the x-axis for density is logarithmic? That is because the relashionship because the relashionship between % change in speed and the density of population seems to be logarithmic which is $y = log(x)$. If the relashionship is logarithmic, a logarithm transformation on the x-axis makes the relashionship appear linear!

In [11]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'

df = pd.read_csv(os.path.join(path,"allVariables.csv"),index_col="Municipality")

df = df[["Density","DagNatRatio","pct_highly_educated","pct_not_vestern","speed_change"]]
df["pct_highly_educated"] = round(df["pct_highly_educated"]*100,2)
df["pct_not_vestern"] = round(df["pct_not_vestern"]*100,2)

df["log_Density"] = np.log(df["Density"])

#df = df[df.index != "Høje-Taastrup"]

# Calculating linear regression lines #################################################################################

linear_regressor = LinearRegression()

df_temp = pd.DataFrame()

for x in df.columns[:-1]:
    
    if x == "Density":
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.speed_change.to_numpy().reshape(-1,1)
        X = np.log(X)
        
    else:
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.speed_change.to_numpy().reshape(-1,1)
    
    
    linear_regressor.fit(X,Y)
    
    Y_pred = linear_regressor.predict(X)
    
    Y_pred = list(Y_pred.squeeze())
    
    name = "prediction_" + x
    
    df_temp[name] = Y_pred

df_temp = df_temp.set_index(df.index)

df = df.join(df_temp,on="Municipality")

########### Create figure #########################################################################################################
source = ColumnDataSource(data = df)

f = [0]*4
toggle = [0]*8

var_dict = {"Density":"Log Density (persons/km^2)",
            "DagNatRatio":"Day/Night Ratio",
            "pct_highly_educated":"% with high education",
            "pct_not_vestern":"% with non-vestern background"}

for i,var in enumerate(df.columns[:4]):
    
    name = var_dict[var]
    
    TOOLTIPS = [("Municipality: ", "@Municipality"),
            (name, f"@{var}"),
            ("Increase in speed: ","@speed_change{%0.2f}")]
    
    if var == "Density":
        x_axis_type = "log"
    else:
        x_axis_type = "linear"

    # create figure
    name = var_dict[var]
    f[i] = figure(plot_width=400,plot_height=350,
        title = f'Speed increase vs. {name}' ,
               x_axis_label=f'{name}',
               y_axis_label='% increase in speed',
               background_fill_color = "white",background_fill_alpha = 0.8, 
               border_fill_color = "white", border_fill_alpha = 0.8, tooltips=TOOLTIPS,
               #y_axis_type="log",
               x_axis_type= x_axis_type,
               toolbar_location = None,
               tools = "box_select"

              )
    
    ## Apply R^2 label ##############################################################################################
    
    if var == "Density":
    
        log_name = "log_" + var
        cor = df[["speed_change",log_name]].corr()
        
    else:
        cor = df[["speed_change",var]].corr()
    

    R_sqrt = round(cor.iloc[0,1]**2,3)


    R_sqrt_Label = Label(x=200, y=5, x_units='screen', y_units='screen',
                 text=f'  R^2 value: {R_sqrt}  ', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='12pt',visible=False
                 #text_font="times"
                 )
    # Create plot
    f[i].circle(x = var, y = 'speed_change', size=10,source=source)

    predict_name = "prediction_" + var
    #Create linear regression line
    line = f[i].line(x=var, y = predict_name,source=source,color="orange",line_width=2,visible=False)
    

    f[i].add_layout(R_sqrt_Label)
    
    
    toggle1 = Toggle(label="Show regression line", button_type="success", active=False,max_width=400,align="center")
    toggle1.js_link('active', line, 'visible')
    
    toggle2 = Toggle(label="Show R^2 value", button_type="success", active=False,width=400,align="center")
    toggle2.js_link('active', R_sqrt_Label, 'visible')
    
    
    #button = CheckboxButtonGroup(labels=["Show regression line", "Show R^2 value"], active=[0, 0])
    
    toggle[i] = toggle1
    toggle[i+4] = toggle2

    

toggles1 = column(toggle[0],toggle[4])
toggles2 = column(toggle[1],toggle[5])
toggles3 = column(toggle[2],toggle[6])
toggles4 = column(toggle[3],toggle[7])

l = layout([[f[0],f[1]],
           [toggles1,toggles2],
           [f[2],f[3]],
           [toggles3,toggles4]],
           
          )

show(l)

It appears that there is a linear relashionship with the log(Density) and a negative linear relashion in % with non-vestern backgrounds.

Sometimes it is helpful to use linear regression to see how well you can make a linear line fit the data. In other words we want to see how well we can predict the percentage change in speed by our 4 variables. Try to click on the 4 buttons Show regression line and see how the lines fit the data. As suspected there is a positive linear relashionship between the log(Density) and a negative linear relashionship with% with non-vestern background while there is a small correlation with % with high education and with Day/Night Ratio

However, just because the slope of the line is negative or positive it does not necessarily mean that the line predict the data well. A helpful measure is the $R^2$ with a value between 0 and 1 which basically measures how well the line explains differences in the dependent variable y (%s change in speed in our case) where 0 denotes it does not explain the data at all and 1 denotes it perfectly fits the data. Try to click on the Show R^2 value and see how well the regression lines fit the data.

The $R^2$ value for the $log(Density)$ is on 0.338 which means that 33.8% of the change in speed can be explained by the population density of the municipality. Taking into consideration that there probably are many factors in play to understand then a $R^2$ is pretty good. This intuively also makes sense because the more densily populated cities usually will have more traffic jams which when are removed will increase the speed substaintially. That the relashionship is logarithmic means that the density of the population at some point won't have the same effect on the % change of speed.

The $R^2$ value for % with non-vestern background is 0.156. This indicates that municipalities with many immigrants with non-vestern background have less increase in speed. This can be harder to explain. One possible explanation can be that people from non-vestern backgrounds have sub-cultures which does not have the same confidence in the government and therefore not follow the directions of the government in staying home as much as possible.

% with high education and Day/Night Ratio $R^2$ values on respectively 0.054 and 0.0 which indicates that they do not explain the % change in speed very well

If you are interested in understand the $R^2$ model better then you can read about it on this link.

Explaining % change in passengers using linear regression

Now we will use the same variables and explore with the same variables if the % decrease in passengers. is dependent on the same. By looking at the scatterplots do you think it actually look like that the Day/Night Ratio explains % change in passengers the best which didn't explain the increase in speed at all! Try to click on the Show regression line and Show R^2 value to see if that is the case.

The Day/Night Ratio explains 37.8% of the decrease in passengers while log(Density) and % with high education explains respectively 11.3% and 16.6% of the variation of decrease in passengers. Assuming that the Day/Night Ratio* is a good measure for how many people work or study outside their municipality then the decrease of passengers is dependent on how many that works outside the municipality. This could indicate most people take public transport when going from and to work. Also the decrease in passengers can also be explained a little by the density of the municipality with probably means that more people are taking public transport in the densily public areas while a high education means they are more likely to be able to do their work from home.

In [12]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'

df = pd.read_csv(os.path.join(path,"allVariables.csv"),index_col="Municipality")

df = df[["Density","DagNatRatio","pct_highly_educated","pct_not_vestern","passenger_change"]]

df["pct_highly_educated"] = round(df["pct_highly_educated"]*100,2)
df["pct_not_vestern"] = round(df["pct_not_vestern"]*100,2)

df["log_Density"] = np.log(df["Density"])

#df = df[df.index != "Albertslund"]

# Calculating linear regression lines #################################################################################

linear_regressor = LinearRegression()

df_temp = pd.DataFrame()

for x in df.columns[:-1]:
    
    if x == "Density":
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.passenger_change.to_numpy().reshape(-1,1)
        X = np.log(X)
        
    else:
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.passenger_change.to_numpy().reshape(-1,1)
    
    #X = np.log(X)
    #Y = np.log(Y)
    
    linear_regressor.fit(X,Y)
    
    Y_pred = linear_regressor.predict(X)
    
    Y_pred = list(Y_pred.squeeze())
    
    name = "prediction_" + x
    
    df_temp[name] = Y_pred

df_temp = df_temp.set_index(df.index)

df = df.join(df_temp,on="Municipality")

########### Create figure #########################################################################################################
source = ColumnDataSource(data = df)

f = [0]*4
toggle = [0]*8

var_dict = {"Density":"Density (persons/km^2)",
            "DagNatRatio":"Day/Night Ratio",
            "pct_highly_educated":"% with high education",
            "pct_not_vestern":"% with non-vestern background"}

for i,var in enumerate(df.columns[:4]):
    
    if var == "Density":
        x_axis_type = "log"
    else:
        x_axis_type = "linear"
    
    name = var_dict[var]
    
    if "pct" in var.split("_"):

        TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var} %"),
                ("Decrease in passengers: ","@passenger_change{%0.2f}")]
    else:
        TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var}"),
                ("Decrease in passengers: ","@passenger_change{%0.2f}")]

    # create figure
    name = var_dict[var]
    f[i] = figure(plot_width=400,plot_height=350,
        title = f'Passenger decrease vs. {name}' ,
               x_axis_label=f'{name}',
               y_axis_label='% decrease in passengers',
               background_fill_color = "white",background_fill_alpha = 0.8, 
               border_fill_color = "white", border_fill_alpha = 0.8, tooltips=TOOLTIPS,
               #y_axis_type="log",
               x_axis_type=x_axis_type,
               toolbar_location = None,
               tools = "box_select"

              )
    
    ## Apply R^2 label ##############################################################################################

    if var == "Density":
    
        log_name = "log_" + var
        cor = df[["passenger_change",log_name]].corr()
        
    else:
        cor = df[["passenger_change",var]].corr()

    R_sqrt = round(cor.iloc[0,1]**2,3)


    R_sqrt_Label = Label(x=200, y=5, x_units='screen', y_units='screen',
                 text=f'  R^2 value: {R_sqrt}  ', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='12pt',visible=False
                 #text_font="times"
                 )
    # Create plot
    f[i].circle(x = var, y = 'passenger_change', size=10,source=source)

    predict_name = "prediction_" + var
    #Create linear regression line
    line = f[i].line(x=var, y = predict_name,source=source,color="orange",line_width=2,visible=False)
    

    f[i].add_layout(R_sqrt_Label)
    
    
    toggle1 = Toggle(label="Show regression line", button_type="success", active=False,max_width=400,align="center")
    toggle1.js_link('active', line, 'visible')
    
    toggle2 = Toggle(label="Show R^2 value", button_type="success", active=False,width=400,align="center")
    toggle2.js_link('active', R_sqrt_Label, 'visible')
    
    
    #button = CheckboxButtonGroup(labels=["Show regression line", "Show R^2 value"], active=[0, 0])
    
    toggle[i] = toggle1
    toggle[i+4] = toggle2

    

toggles1 = column(toggle[0],toggle[4])
toggles2 = column(toggle[1],toggle[5])
toggles3 = column(toggle[2],toggle[6])
toggles4 = column(toggle[3],toggle[7])

l = layout([[f[0],f[1]],
           [toggles1,toggles2],
           [f[2],f[3]],
           [toggles3,toggles4]],
           
          )

show(l)

Do you remember that Albertslund stands out on the choropleth map? Try to find it on the scatter plot and mark it to see where it is on the other scatter plots. Do you think the variables explain Albertslund or does it appear to be an outlier?

It appears that none of the variables explains why Albertslund stands out. This can be due to some special conditions that only apply to Albertslund or mistakes in the measurements. However, Albertslund seemingly affects the fitting of the line substaintially because because we have few observations.

For fun lets say we want to predict the decrease in passengers for all municipalities except Albertslund and see how well the lines fit the data which can be seen on the graphs below.

In [13]:
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'

df = pd.read_csv(os.path.join(path,"allVariables.csv"),index_col="Municipality")

df = df[["Density","DagNatRatio","pct_highly_educated","pct_not_vestern","passenger_change"]]

df["pct_highly_educated"] = round(df["pct_highly_educated"]*100,2)
df["pct_not_vestern"] = round(df["pct_not_vestern"]*100,2)

df["log_Density"] = np.log(df["Density"])

df = df[df.index != "Albertslund"]

# Calculating linear regression lines #################################################################################

linear_regressor = LinearRegression()

df_temp = pd.DataFrame()

for x in df.columns[:-1]:
    
    if x == "Density":
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.passenger_change.to_numpy().reshape(-1,1)
        X = np.log(X)
        
    else:
        X = df[x].to_numpy().reshape(-1,1)
        Y = df.passenger_change.to_numpy().reshape(-1,1)
    
    #X = np.log(X)
    #Y = np.log(Y)
    
    linear_regressor.fit(X,Y)
    
    Y_pred = linear_regressor.predict(X)
    
    Y_pred = list(Y_pred.squeeze())
    
    name = "prediction_" + x
    
    df_temp[name] = Y_pred

df_temp = df_temp.set_index(df.index)

df = df.join(df_temp,on="Municipality")

########### Create figure #########################################################################################################
source = ColumnDataSource(data = df)

f = [0]*4
toggle = [0]*8

var_dict = {"Density":"Density (persons/km^2)",
            "DagNatRatio":"Day/Night Ratio",
            "pct_highly_educated":"% with high education",
            "pct_not_vestern":"% with non-vestern background"}

for i,var in enumerate(df.columns[:4]):
    
    if var == "Density":
        x_axis_type = "log"
    else:
        x_axis_type = "linear"
    
    name = var_dict[var]
    
    if "pct" in var.split("_"):

        TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var} %"),
                ("Decrease in passengers: ","@passenger_change{%0.2f}")]
    else:
        TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var}"),
                ("Decrease in passengers: ","@passenger_change{%0.2f}")]

    # create figure
    name = var_dict[var]
    f[i] = figure(plot_width=400,plot_height=350,
        title = f'Passenger decrease vs. {name}' ,
               x_axis_label=f'{name}',
               y_axis_label='% decrease in passengers',
               background_fill_color = "white",background_fill_alpha = 0.8, 
               border_fill_color = "white", border_fill_alpha = 0.8, tooltips=TOOLTIPS,
               #y_axis_type="log",
               x_axis_type=x_axis_type,
               toolbar_location = None,
               tools = "box_select"

              )
    
    ## Apply R^2 label ##############################################################################################

    if var == "Density":
    
        log_name = "log_" + var
        cor = df[["passenger_change",log_name]].corr()
        
    else:
        cor = df[["passenger_change",var]].corr()

    R_sqrt = round(cor.iloc[0,1]**2,3)


    R_sqrt_Label = Label(x=200, y=5, x_units='screen', y_units='screen',
                 text=f'  R^2 value: {R_sqrt}  ', render_mode='canvas',
                 border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0,
                 text_font_size='12pt',visible=False
                 #text_font="times"
                 )
    # Create plot
    f[i].circle(x = var, y = 'passenger_change', size=10,source=source)

    predict_name = "prediction_" + var
    #Create linear regression line
    line = f[i].line(x=var, y = predict_name,source=source,color="orange",line_width=2,visible=False)
    

    f[i].add_layout(R_sqrt_Label)
    
    
    toggle1 = Toggle(label="Show regression line", button_type="success", active=False,max_width=400,align="center")
    toggle1.js_link('active', line, 'visible')
    
    toggle2 = Toggle(label="Show R^2 value", button_type="success", active=False,width=400,align="center")
    toggle2.js_link('active', R_sqrt_Label, 'visible')
    
    toggle[i] = toggle1
    toggle[i+4] = toggle2

    

toggles1 = column(toggle[0],toggle[4])
toggles2 = column(toggle[1],toggle[5])
toggles3 = column(toggle[2],toggle[6])
toggles4 = column(toggle[3],toggle[7])

l = layout([[f[0],f[1]],
           [toggles1,toggles2],
           [f[2],f[3]],
           [toggles3,toggles4]],
           
          )

show(l)

Now all regression lines minimum explain 21% of the variation of the decrease of passengers where % with high education goes from explaining 16.6% to 42.0% while % with non-vestern background goes from not really explaining the variation in passenger decrease to explaining 21%. This could indicate that there are specific things going on in Albertslund that makes it stand out while but most of all it shows how sensitive linear regression is when you have few data points and therefore the regression lines should only be taken as indications of possible explanations.

Understanding all variables at once

We have gotten a better understanding of how a single variable alone affects the % speed and passenger change in the municipalities. However, we do not understand how the variables put together explains the % change. Also, we thought we would add some more variables which are % unemployed, disposable income and median age. which gives 7 variables in total.We cannot plot in anything more than 3D so we have to be smart about it. Luckily, Karl Pearson came up with a smart technique back in 1901 called Principal Component Analysis.

We won't go into details of how it works but basically each Principal Component (PC) shows how variables explain variations in the data. So, if you click on the PC1 on the graph below we can see that if you have a municipality with the combination above average population density, above average unemployment, above average education level and below average % of non-vestern immigrants and median age then it is likely the increase in speed will be high. So how could we see that?

Imagine that you have a checkbox in front of you with where you can give a checkmark for each variable and the more you can check off, the more likely it is the increase in speed or decrease in passengers is high.

You need to know 3 things to be able to 'check the boxes of':

  1. If a bar points in the same direction as speed change or passenger change then you need the value of the variable to be above average to check the box off. If a bar points in the opposite direction of passenger change or speed change you need to have a value below average to check the box. So same = above average and oppostive = below average
  2. The higher the bar, the more important. So in PC1, it is much more important to have high population density than Day/Night Ratio.
  3. If you can't check many boxes then it means that the increase in speed or decrease in passengers will be low.

Now we have a challenge for you. Click on PC3 for passenger decrease and answer this question. If we have a municipality with below average population density, above average of people present both day and night, below average unemployment and education level and below average median age, is the municipality then likely to have a large or small decrease in passengers? To help you we have made a checkbox to help you keep track.

In [14]:
## Prepare data ##################################################################################

variables = ["log_Density","DagNatRatio","pct_unemployed","pct_highly_educated",
             "pct_not_vestern","income","median_age",
            ]

change_variables = ["speed_change","passenger_change"]

sel_mun = [
           "Albertslund",
           "Ballerup","Brøndby","Dragør","Frederiksberg","Gentofte","Gladsaxe",
                           "Glostrup","Herlev","Hvidovre","Høje-Taastrup","Ishøj","København",
                           "Rødovre","Tårnby","Vallensbæk"]

path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'

df = pd.read_csv(os.path.join(path,"allVariables.csv"),sep=",",index_col=0)

df["log_Density"] = np.log(df["Density"])

p = {}

change_dict = {"speed_change":"Speed increase (%)","passenger_change":"Passenger decrease (%)"}

for c in change_variables:

    #df = df[df.index != "Albertslund"]

    x = df[variables+[c]].values

    x = StandardScaler().fit_transform(x)

    df_std = pd.DataFrame(data = x, columns = variables+[c])

    pca = PCA(n_components=4)

    PC = pca.fit_transform(df_std)

    components = pca.components_

    PC_list = ["PC"+str(i) for i in range(1,5)]

    PC_4D = pd.DataFrame(components,columns=variables+[c],index=PC_list).T

    ## Create plot ###################################################################################

    #Creating format fitting for bokeh
    source = ColumnDataSource(PC_4D)

    #Creating a list for the x-axis on the bar-chart. 
    x_range = [str(h) for h in source.data['index']]

    #Generating 14 colors for the different focus crimes  
    colors = bokeh.palettes.Category20[18][0::2]

    bar ={} # to store vbars

    #Creating figure frame with title, x -and y labels and disabling toolbar. 
    p[c] = figure(x_range=FactorRange(factors=x_range),
               title=f'{change_dict[c]} Principal Component Coefficients',
               x_axis_label='Variables',
               y_axis_label='Coefficients',
               plot_width=500,plot_height=400,
               toolbar_location = None)


    PCs = ["PC1","PC2","PC3","PC4"]

    #Generating barcharts to p 
    for indx,i in enumerate(PCs):
        bar[i] = p[c].vbar("index",  top=i, source= source, width=0.5, 
                     fill_alpha=0.5,line_color=None,
                     color=colors[indx],visible=False)
    
    p[c].xaxis.major_label_orientation = math.pi/4

    items = [(i,[bar[i]]) for i in PCs] #Creating list of tuples with focus crime name and belonging bar chart.
    legend = Legend(items=items,location=(0,125),) # Creating legends with 'items'
    p[c].add_layout(legend,"right") # Adding legends to ´p´ and setting location. 

    p[c].legend.click_policy="hide"
    p[c].legend.title = 'Show PC'
    
    
show(row(p["speed_change"],p["passenger_change"]))
In [55]:
#checkbox_group = CheckboxGroup(
#      labels=variables, active=[0, 1])

checkbox_group = CheckboxGroup(
        labels=variables, active=[0, 1])

show(checkbox_group)

Click here to see the answer.

Now one last thing and we are done. Actually, the Principal Components (PC) are axes in a coordinate system and the coefficients shown above control how low (-) or how high (+) they land on the PC's. The PC's ensure that there are as much variation as possible which means we can make it possible to see what makes the municipalities increase in speed and decrease in passengers apart. The plots are color coded so the darker the green the higher the change.

Try to hover over the darkest circle on the plot to the left and you will see that it is Rødovre with the highest increase in speed and that it scores positive on PC1 and negatively on PC3. Then try to hover over the lightest circle and you will see that it is Høje-Taastrup with the lowest increase in speed that is negative on PC1 and positve on PC3.

Now notice how the colors generally are light on one side of the dashed line and generally dark on the other side of the dashed line. This means that we simply by looking at where they land on the PC's can determine if the increase in speed or decrease in passengers is likely to become high or low!

In [48]:
############ prepare data ##############################################################################################
from branca.colormap import linear
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from bokeh.layouts import layout, gridplot, row, column, widgetbox
%matplotlib inline


variables = ["log_Density","DagNatRatio","pct_unemployed","pct_highly_educated",
             "pct_not_vestern","income","median_age",
            ]

change_variables = ["speed_change","passenger_change"]

change_dict = {"speed_change":"Speed increase (%)","passenger_change":"Passenger decrease (%)"}

sel_mun = [
           "Albertslund",
           "Ballerup","Brøndby","Dragør","Frederiksberg","Gentofte","Gladsaxe",
                           "Glostrup","Herlev","Hvidovre","Høje-Taastrup","Ishøj","København",
                           "Rødovre","Tårnby","Vallensbæk"]

path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'

df = pd.read_csv(os.path.join(path,"allVariables.csv"),sep=",",index_col=0)

df["log_Density"] = np.log(df["Density"])

#df = df[df.index != "Albertslund"]

for c in change_variables:
    

    x = df[variables.copy()+[c]].copy().values

    x = StandardScaler().fit_transform(x)

    df_std = pd.DataFrame(data = x, columns = variables.copy()+[c],index=sel_mun)

    pca = PCA(n_components=4)

    PC_color = pca.fit_transform(df_std.copy())

    df_color = pd.DataFrame(PC_color,index=sel_mun,columns=["PC1","PC2","PC3","PC4"])


    colormap = linear.YlGn_09.scale(df[c].min(),df[c].max())


    df_color[c] = df[c]*100

    df_color["color"] = [colormap(df.loc[m,c]) for m in sel_mun]

    #Create overview of values being above or below mean
    df_above_below = (df_std > 0) 

    for cl in df_above_below.columns:
        df_above_below[cl] = ["Above" if cl else "Below" for cl in df_above_below[cl]]

    df_above_below = df_above_below.iloc[:,:-1]
    df_color = df_color.reset_index()

    df_color = df_color.join(df_above_below,on="index")

    df_color = df_color.set_index("index")


    ########### Create figure #########################################################################################################
    source = ColumnDataSource(data = df_color)

    
    TOOLTIPS = [("Municipality: ", "@index"),
                 ("% change", "@{%s}" %c),
                 ("log Density","@log_Density"),
                 ("Day/Night","@DagNatRatio"),
                 ("Unemployment","@pct_unemployed"),
                 ("Education","@pct_highly_educated"),
                 ("Non-vestern","@pct_not_vestern"),
                 ("Income", "@income"),
                 ("Median age","@median_age")]


    # create figure
    p[c] = figure(plot_height = 500, plot_width = 500,
        title = f'{change_dict[c]}: PC1 and PC3',
               x_axis_label='Principal Component 1',
               y_axis_label='Principal Component 3',
               background_fill_color = "white",background_fill_alpha = 0.8, 
               border_fill_color = "white", border_fill_alpha = 0.8, tooltips=TOOLTIPS,
               #y_axis_type="log"
               toolbar_location = None,
                y_range=(-2.5,2.7)
              )


    # Create plot
    #s.circle(x = "PC3", y = 'PC4', size=15,source=source,color="speed_color",line_alpha=1,line_color="black")
    p[c].circle(x = "PC1", y = 'PC3', size=15,source=source,color="color",line_alpha=1,line_color="black")

    p[c].line([-2.5, 4.5], [-2, 2.1], line_color="grey",line_dash="dashed")


show(row(p["speed_change"],p["passenger_change"]))
    

So do you remember Albertslund seemed to be an outlier when doing linear regression? Let us see if we now can explain why it has a high change in passengers. If you hover over the darkest circle on the right plot you will find Alberslund. Albertslund almost lies on 0 at PC1 while it is the lowest point on PC3. So PC3 alone can explain why Albertslund have a high decrease in passengers. Let's look at its values.

Wait a little. They are in fact identical to the municipality described above so you have actually already explained why Albertslund has a high decrease in passengers.

In [ ]: